from bigbang import plot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt


  • move most of this to the libraries
  • order groups reliably by time
    • in data
    • in plot
  • summarize groups
  • convolve the time series with a declining 'response time'

%matplotlib inline

from bigbang.archive import Archive

# Archive object for "ipython-dev", with archive_dir set to ../archives
# (a path relative to the working directory).
arx = Archive("ipython-dev",archive_dir="../archives")

Get the activity of the mailing list

# Activity table for the archive.  The code below treats its columns as
# participants and its column sums as per-participant totals.
# NOTE(review): the effect of resolved=True is defined in bigbang.archive — confirm.
act = arx.get_activity(resolved=True)

Since we are going to be computing correlations between N different time series data sets, and that is an $O(N^2)$ operation, let's limit N.

# Threshold handed to filtered_participants(): a column of `act` is kept
# only when its sum is strictly greater than this value.
cutoff = 5

def filtered_participants(cutoff):
    """Return the column labels of ``act`` whose column sum exceeds *cutoff*.

    Reads the module-level ``act`` table.  The comparison is strict, so a
    column whose sum equals *cutoff* is left out.
    """
    totals = act.sum()
    return act.columns[totals > cutoff]


Index([u'kevin.buchs@gmail.... (Kevin Buchs)',
       u'mantegazza at (=?iso-8859-15?q?Fr=E9d=E9ric_Mantegazza?=)',
       u'takowl@gmail.... (Thomas Kluyver)',
       u'robert.kern@gmail.... (Robert Kern)',
       u'cschin@infoecho.... (Chen-Shan Chin)',
       u'viktor.ransmayr at (Viktor Ransmayr)',
       u'ronena@gmail.... (Ronen Abravanel)',
       u'tarun.gaba7@gmail.... (TARUN GABA)',
       u'dineshvadhia@outlook.... (Dinesh Vadhia)',
       u'rgbkrk@gmail.... (Kyle Kelley)'],

from scipy.stats.stats import pearsonr  

fc = filtered_participants(cutoff)

n = len(fc)

# Pearson correlation assumes normally distributed data, and we believe
# activity to be log-normally distributed (see Shalizi...), so we correlate
# log(1 + activity).  The transform is computed once per participant here
# instead of once per (i, j) pair inside the double loop.
log_act = [np.log1p(act[name]) for name in fc]

pc = np.zeros([n, n])

for i in range(n):
    # Only the upper triangle (j >= i) is computed; each value is mirrored
    # into the lower triangle so pc ends up symmetric.
    for j in range(i, n):
        c = pearsonr(log_act[i], log_act[j])[0]
        pc[i, j] = c
        pc[j, i] = c

array([[ 1.        , -0.0038632 ,  0.07482248, ..., -0.00196935,
        -0.00261331, -0.00399173],
       [-0.0038632 ,  1.        , -0.02827716, ..., -0.00345979,
        -0.00459112, -0.00701276],
       [ 0.07482248, -0.02827716,  1.        , ...,  0.05417618,
         0.09258522,  0.0468503 ],
       [-0.00196935, -0.00345979,  0.05417618, ...,  1.        ,
         0.0439154 , -0.00357491],
       [-0.00261331, -0.00459112,  0.09258522, ...,  0.0439154 ,
         1.        , -0.00474387],
       [-0.00399173, -0.00701276,  0.0468503 , ..., -0.00357491,
        -0.00474387,  1.        ]])

# Graph built directly from the correlation matrix; its nodes start out as
# integer positions and are renamed to the matching entries of fc.
labels = {position: name for position, name in enumerate(fc)}
G = nx.relabel_nodes(nx.Graph(pc), labels)

# Order nodes by the total activity of each participant, largest first.
# Series.order() was removed from pandas; sort_values() is its replacement.
o = list(act[fc].sum().sort_values(ascending=False).keys())

from sklearn import cluster

# First pass: spectral clustering of pc into two clusters.
n_clusters = 2
sc = cluster.SpectralClustering(n_clusters=n_clusters)
partition = sc.fit_predict(pc)

# argsort of the labels yields positions grouped by ascending cluster label;
# translate those positions back into entries of fc.
partition_order = [fc[position] for position in np.argsort(partition)]

Modularity metric from

Zanetti, M. and Schweitzer, F. 2012. "A Network Perspective on Software Modularity" ARCS Workshops 2012, pp. 175-186. :

  • $e_{ij}$ is fraction of edges in network that connect nodes from module $i$ to nodes from module $j$
  • $a_i = \sum_{j}^{n} e_{ij}$, $b_i = \sum_{j}^{n} e_{ji}$
$$Q = \frac{\sum_{i}^{n} e_{ii} - \sum_{i}^{n} a_{i}b_{i}}{1 - \sum_{i}^{n} a_{i}b_{i}} $$

Here we try to adapt this metric to edges with weights in $(0,1]$.

from bigbang.process import modularity

def get_colors(num):

def ksc(m,k=3):
    """Perform spectral clustering on matrix *m* with *k* clusters.

    Parameters
    ----------
    m : array-like
        Matrix passed to ``SpectralClustering.fit_predict``.
    k : int, optional
        Number of clusters to find (default 3).

    Returns
    -------
    The array of cluster labels produced by ``fit_predict``.

    Notes
    -----
    This clusters for one fixed *k* only.  Picking the *k* that optimizes
    modularity is up to the caller, which can call this once per candidate.
    """
    sc = cluster.SpectralClustering(n_clusters=k)
    # Cluster the matrix that was passed in rather than the module-level
    # ``pc``, so the ``m`` argument is actually honoured.
    return sc.fit_predict(m)

# Cluster labels returned by ksc for pc with k=3.
partition = ksc(pc,k=3)

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 2,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0], dtype=int32)

# Entries of fc reordered by ascending cluster label, so members of the
# same cluster sit next to each other.
partition_order = [fc[x] for x in np.argsort(partition)]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2], dtype=int32)

from collections import defaultdict

p = {}       # k -> cluster labels returned by ksc for that k
parts = {}   # k -> {cluster label: [entries of fc in that cluster]}
p_n = {}     # k -> {cluster label: [positions in fc / pc in that cluster]}
mods = []    # one modularity score per k, in the order the k values are tried

for k in range(2,20):
    p[k] = ksc(pc,k=k)
    parts[k] = defaultdict(list)
    p_n[k] = defaultdict(list)
    # NOTE(review): the body of this loop was missing from the source and has
    # been reconstructed from how parts[k] and p_n[k] are consumed: parts[k]
    # is later read as label -> list of node names, and p_n[k].values() is
    # handed to modularity() together with pc, so presumably it holds
    # positions into pc — confirm against bigbang.process.modularity.
    for i, label in enumerate(p[k]):
        parts[k][label].append(fc[i])
        p_n[k][label].append(i)

    mod = modularity(pc,p_n[k].values())
    # Record the score; the best k is later picked with np.argmax(mods).
    mods.append(mod)

# Parenthesized so the line is valid on Python 3 as well; with a single
# argument it prints the same thing on Python 2.
print(mods)

[0.68273829865586444, 0.73119213153041951, 0.61508542073140959, 0.5789327303452666, 0.55223802103348796, 0.58206762553632729, 0.58431430259624972, 0.59199558260411789, 0.59556101446568765, 0.62098559686576504, 0.63134826966808533, 0.62482516308819835, 0.59505523519905168, 0.61695124241799348, 0.65404938597260365, 0.61714630005557225, 0.62284259062970893, 0.60026039039551249]
# mods[0] belongs to k = 2 (the first value tried in the sweep), hence the
# +2 offset when turning the best position back into a cluster count.
k = np.argmax(mods) + 2
partition_order = [fc[x] for x in np.argsort(p[k])]

# Parenthesized so the line is valid on both Python 2 and Python 3.
print(k)


# Report the size of each cluster for the chosen k as "<label>: <count>".
# print is called as a function so the cell runs on Python 2 and Python 3.
for x,v in parts[k].items():
    print(str(x) + ": " + str(len(v)))

0: 155
1: 3
2: 70

# Invert parts[k] into a flat lookup: node name -> cluster label as a plain
# int.  A comprehension is used so that the module-level names n (set to
# len(fc) earlier), x and v are not rebound by loop variables.
node_parts = {
    name: int(label)
    for label, members in parts[k].items()
    for name in members
}

import networkx as nx

# Keep only the positive correlations as edge weights; every other entry is
# multiplied by False and so becomes 0.
# (Alternative that was tried: a thresholded graph, nx.Graph(pc > .01).)
# nx.from_numpy_matrix was removed in NetworkX 3.0; constructing the Graph
# directly from the array matches how G is built earlier in this notebook.
FG = nx.Graph(pc * (pc > 0))

labels = dict(enumerate(fc))
FG = nx.relabel_nodes(FG,labels)

pos = nx.spring_layout(FG, k = 0.6, iterations = 1000)

from bigbang import plot


